Set working directory

Make sure the working directory is set correctly!

setwd("/bfg/data/courses/seeded2017/WGBS/")

Using bsseq objects

The bsseq objects are organized per chromosome and contain all samples:

library(bsseq)
# read in single chromosome for all samples
meth_data_chr21 <- readRDS(file = "bsseq/chr21.rds")

They have been smoothed in advance, but still contain also the raw count values. Calling the object name alone gives us some metainformation:

meth_data_chr21
## An object of type 'BSseq' with
##   760888 methylation loci
##   7 samples
## has been smoothed with
##   BSmooth (ns = 70, h = 1000, maxGap = 100000000) 
## All assays are in-memory

We can check which samples are included in our object:

meth_data_chr21@colData@rownames
## [1] "BC-pano"    "BC-pano-D2" "BC-prim"    "BC-unt"     "BC-unt-D2" 
## [6] "PBMC"       "TC-prim-D2"

The actual number of 760888 CpGs for chromosome 21 is still high for demontration purpose, so we will reduce them and only use the first 100000 CpGs:

meth_data_chr21_trim <- meth_data_chr21[1:100000,]
meth_data_chr21_trim
## An object of type 'BSseq' with
##   100000 methylation loci
##   7 samples
## has been smoothed with
##   BSmooth (ns = 70, h = 1000, maxGap = 100000000) 
## All assays are in-memory

For some analysis we might need all data, so in principle it is possible to load all chromosomes for all samples into a list:

# in case of multiple chromosomes, lists are handy
meth_data_all <- list()
# TODO: change
for (chr in c(seq(1, 2), "X")) {
  message(paste0("Loading chr", chr))
  meth_data_all[[chr]] <- readRDS(file = paste0("bsseq/chr", chr, ".rds"))
}

Just keep the size in mind:

size <- object.size(meth_data_all)
print(size, units = "auto")
## 87 Mb

QC

Our in-house pipeline provides a QC summary collecting information from different tools such as: samtools flagstats or Picard.

Familiarize with the attributes. Which values would you find alarming?

library(data.table)
qc <- fread(file = "seeded_wgbs_qc.tsv", header = T, data.table = F)
qc

The file seeded_qc_chr1.tsv contains more information in the bisulfite conversion rate. Open it using a editor of your choice.

Let’s inspect the CpG coverage. This function calculates the CpG coverage based on the data we just loaded.

#'@description calculate the CpG coverage
#'@param covfirst: first coverage value
#'@param covlast: last coverage value
#'@return coverage_data_all
calcCpGCoverage <- function(bsseqObjectList = NA, covfirst = NA, covlast = NA) {
  message ("Calculating coverage...")
  wish_cov <- seq(covfirst, covlast)
  coverage_data_all <- list()
  for (chr in names(bsseqObjectList)) {
    coverage_data <- NA
    coverage_data_chr <- getCoverage(BSseq = bsseqObjectList[[chr]], type = "Cov", what = "perBase")

    for (sample in colnames(coverage_data_chr)) {
      table_per_chr_per_sample <- table( coverage_data_chr[,sample] )
      coverage_data_temp <- cbind(wish_cov,
                                  unname(table_per_chr_per_sample[wish_cov + 1]),
                                  rep(sample, length(wish_cov)),
                                  rep(chr, length(wish_cov))
      )
      colnames(coverage_data_temp) <- c("coverage", "num.positions", "sample", "chr")
      coverage_data_df <- as.data.frame(coverage_data_temp)
      coverage_data <- rbind(coverage_data, coverage_data_df)
    }

    coverage_data <- na.omit(coverage_data)
    coverage_data$coverage <- as.numeric(as.character(coverage_data$coverage))
    coverage_data$num.positions <- as.numeric(as.character(coverage_data$num.positions))
    coverage_data_all[[chr]] <- coverage_data
  }
  coverage_data_all
  return(coverage_data_all)
}
coverage_data_all <- calcCpGCoverage(bsseqObjectList = meth_data_all, covfirst = 1, covlast = 30)
## Calculating coverage...
coverage_data_all[["1"]]

CpG coverage for a single chromosome:

library(ggplot2)
chr <- "1"
ggplot(coverage_data_all[[chr]], aes(y=num.positions, x=coverage, col=sample)) +
      geom_line() +
      ggtitle(paste0("CpG coverage for chr", chr))

#'@description aggregate the coverage data
#'@param coverage_data_all
#'@return methpiperObject
calcAggregateCoverage <- function(coverage_data_all = NA, total_num_cpgs = NA) {
  message ("Aggregating coverage...")
  # aggregate
  new <- do.call(rbind, coverage_data_all)
  aggregatedCoverage <- aggregate(formula = num.positions ~ sample + coverage, data = new, FUN = sum)
  aggregatedCoverage_withChr <- aggregate(formula = num.positions ~ sample + coverage + chr, data = new, FUN = sum)
  # calc cumulative density function CDF
  coverage_data_cumsum_df <- NA
  for (sample in unique(aggregatedCoverage$sample)) {
    temp <- as.data.frame((cumsum(aggregatedCoverage[aggregatedCoverage$sample == sample,]$num.positions) / total_num_cpgs))
    temp <- cbind(temp, seq(1, dim(temp)[1]), rep(sample, dim(temp)[1]))
    colnames(temp) <- c("num.positions", "coverage", "sample")
    coverage_data_cumsum_df <- rbind(coverage_data_cumsum_df, temp)
  }
  return(na.omit(coverage_data_cumsum_df))
}
aggregatedCoverage <- calcAggregateCoverage(coverage_data_all = coverage_data_all, total_num_cpgs = 28162537)
aggregatedCoverage

Plot the coverage for all chromosomes:

ggplot(aggregatedCoverage, aes(y=num.positions/28162537, x=coverage, col=sample)) +
    geom_line() +
    ylab("fraction genomic CpGs") +
    ggtitle(paste0("CpG coverage for all chromosomes"))

Global methylation level

We can first check the global methylation level of our samples, calculating the mean beta value over all samples.

colMeans(getMeth(meth_data_chr21))
##    BC-pano BC-pano-D2    BC-prim     BC-unt  BC-unt-D2       PBMC 
##  0.7849186  0.7512391  0.7851276  0.7828623  0.7506774  0.7952328 
## TC-prim-D2 
##  0.7692945

Beta value distribution

In general it is useful to check the beta value distribution of each sample, most of the CpGs in the genome are methylated:

library(reshape2)
# extract the methylation values
meth_per_cpg <- getMeth(meth_data_chr21, type = "smooth")
# from wide to long format
meth_per_cpg_melted <- melt(meth_per_cpg)
# plot "histogram" (frequency polygons) of beta values for each sample 
ggplot(meth_per_cpg_melted, aes(x=value, color=Var2)) +
  geom_freqpoly(binwidth = 0.01) +
  facet_wrap( ~Var2, ncol = 2)

Calling DMRs using DSS

Now we would like to find Differentially Methylated Regions (DMRs) for two groups of samples. Let’s say B cells vs peripheral blood mononuclear cells (PBMC), so we would like to compare the samples: BC-prim vs PBMC, we define these groups as:

groupA <- c("BC-prim")
groupB <- c("TC-prim-D2")

The order of groupA and groupB will not matter for testing DMLs/DMRs but will be important what we will call hyper- or hypomethylated and is reflected as + or - beta value.

DSS expects bsseq objects so we can directly plug that in and start calling Differentially Methylated Loci (DMLs) using our smoothed beta values:

library(DSS)

dmlTest <- DMLtest(meth_data_chr21_trim[,c(groupA, groupB)], group1 = groupA, group2 = groupB, smoothing = TRUE)
## Smoothing ...
## Estimating dispersion for each CpG site, this will take a while ...
dmls <- callDML(dmlTest)

Or directly call DMRs:

dmrs <- callDMR(dmlTest)
head(dmrs)

Let’s switch to the full list of DMRs (also all chromosomes) and the original dataset:

library(data.table)
dmrs <- fread(file = "BC-primary_vs_PBMC_allChromosomes.tsv", header = T, data.table = F)

What is the distribution of DMRs in terms of beta value?

hist(dmrs$diff.Methy, breaks = 100)

How does the number of CpGs in DMRs vs the change in methylation look like?

plot(dmrs$diff.Methy, dmrs$nCG)

hist(dmrs$length, breaks = 100)

hist(dmrs$nCG, breaks = 100)

Excercise: How many and which DMRs are hypomethylated or hypermethylated?

## [1] 36825     9
## [1] 9157    9

Excercise: Perform DMR calling for the comparison BC-panobinostat vs BC-untreatet in a similar way.

## Smoothing ...
## Estimating dispersion for each CpG site, this will take a while ...

Make sure to switch back to the comparison BC-primary vs PBMC:

groupA <- c("BC-prim")
groupB <- c("TC-prim-D2")
dmrs <- fread(file = "BC-primary_vs_PBMC_allChromosomes.tsv", header = T, data.table = F)

Genomic Ranges

Genomic Ranges (GRanges) are a popular representation of genomic regions for R. They allow similar operations similar to bedtools.

Here a short example to create a GRanges object:

regions <- GRanges(
  seqnames = c("1", "2"),
  ranges = IRanges(start = c(1000, 1500),
                   end = c(1300, 2000)
                   )
)
regions
## GRanges object with 2 ranges and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]        1 1000-1300      *
##   [2]        2 1500-2000      *
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths
as.data.frame(regions)

Annotation of DMRs

Of course it would be interesting to annotate our DMRs, answering for example: What is the next gene? How far is it away?

For this we are using GRanges but the same task could be done in bedtools:

library(rtracklayer)
# loading our annotation
hg19_refseq <- import("annotations/RefSeq_Nov15_2011_from_annovar_Genes_plain.bed.gz", format="bed")
# creating our GRanges from the dmr list
dmrs_gr <- GRanges(
  seqnames = dmrs$chr,
  ranges = IRanges(start = dmrs$start,
                   end = dmrs$end
  )
)
# look for the nearest gene from DMR
distanceToNearest <- distanceToNearest(dmrs_gr, hg19_refseq)
# append gene name and distance to our DMRs
dmrs_annotated <- cbind(dmrs, gene = hg19_refseq[distanceToNearest@to]$name, gene_distance = as.data.frame(distanceToNearest)$distance)

Exercise: How many DMRs are within a range of 5kb up- and downstream? How many outside 5kb?

## [1] 33282
## [1] 12700

Exercise: Let’s consider only DMRs in a 5kb distance to annotated genes. How many genes are associated to exactly 1 gene? How many to exactly 2, 3, 4… genes? Hint: table() is a useful function…

Exercise: The file annotations/cpgIslandExt_nochr.bed contains coordinates for CpG islands. Annotate the DMRs for CpG islands in a similar way and append the result to the dmrs_annotated object.

Visualization of DMRs

Plotting DMRs based on smoothed values:

bsseq also provides functions for plotting, independently which DMR caller has been used. Essential as input is a single DMR or multiple DMRs. Also multiple (custom) graphical annotations are possible, in fact any region of interest can be plotted this way.

Let’s plot a single DMR and extend the region 2000 bp up- and downstream:

plotRegion(meth_data_chr21[,c(groupA, groupB)], region=dmrs[dmrs$chr == 21,][1,], extend = 2000, addRegions = dmrs)

It would be nice to know also graphically where these differences occur in a gene, in a promotor, enhancer, gene body? For this we can modify our plot and add some annotations usually from open databases or based on other NGS datasets. These you can download, often as bed files for example from ENCODE, Refseq, UCSC browser, just to name a few. Ensure the annotations match the reference genome you use.

First load the the tracks using the package rtracklayer:

library(rtracklayer)
hg19_refseq <- import("annotations/RefSeq_Nov15_2011_from_annovar_Genes_plain.bed.gz", format="bed")
hg19_cgi <- import("annotations/CpGIslands_plain.bed.gz", format="bed")
hg19_Encode_enhancer <- import("annotations/encode_strong_enhancer_gap_1kb_nochr.bed.gz",format="bed")
hg19_Encode_TFBS <- import("annotations/encode_uniform_tfbs_merged_1k_nochr.bed.gz",format="bed")
hg19_Encode_LINE <- import("annotations/repeats_LINE_nochr.bed.gz",format="bed")
hg19_Encode_SINE <- import("annotations/repeats_SINE_nochr.bed.gz",format="bed")

We then combine the tracks in one object and pass it on to the plotting function:

annoTrack=c(GENE=hg19_refseq, CGI=hg19_cgi, Enhancer=hg19_Encode_enhancer, TFBS=hg19_Encode_TFBS, LINE=hg19_Encode_LINE, SINE=hg19_Encode_SINE)
plotRegion(meth_data_chr21[,c(groupA, groupB)], region=dmrs[dmrs$chr == 21,][1,], extend = 5000, annoTrack = annoTrack, addRegions = dmrs)

Finally adding colors and plot multiple DMRs, for example the first five DMRs:

meth_data_sub <- meth_data_chr21[,c(groupA, groupB)]
pData <- pData(meth_data_sub)
pData$col <- c("red", "blue")
pData
## DataFrame with 2 rows and 1 column
##                    col
##            <character>
## BC-prim            red
## TC-prim-D2        blue
pData(meth_data_sub) <- pData
plotManyRegions(meth_data_sub, regions=dmrs[dmrs$chr == 21,][1:5,], extend = 5000, annoTrack = annoTrack, addRegions = dmrs)
## [plotManyRegions] preprocessing ...done
## [plotManyRegions]   plotting region 1 (out of 5)

## [plotManyRegions]   plotting region 2 (out of 5)

## [plotManyRegions]   plotting region 3 (out of 5)

## [plotManyRegions]   plotting region 4 (out of 5)

## [plotManyRegions]   plotting region 5 (out of 5)

Exercise:

We have a region of interest, it is located on chromosome 21, starting at position 45377052 and ending at 45378087. Create a smoothed beta value plot for this region Hint: create a Genomic Range first.

Using smoothed values gives nice plots but one should be aware of low/no coverage regions, where there the smoothed beta value is still given. This can lead to artifacts. Another visualization is a heatmap of a region based on raw counts:

## Warning: package 'pheatmap' was built under R version 3.5.2

Here every column is a CpG, the distance between CpGs is not the same!

Beta value matrix of DMRs

bsseq provides a convinient function to generate a matrix of mean beta values for certain regions or DMRs. This beta value matrix can then be used for example in a PCA and/or heatmap.

library(bsseq)
chr <- "21"
regions <- dmrs[dmrs$chr==chr & abs(dmrs$diff.Methy) >= 0.3 & dmrs$nCG >=3,]
meth_values <- getMeth(BSseq = meth_data_chr21, type = "raw", regions = regions, what = "perRegion")
head(as.matrix(meth_values))
##        BC-pano BC-pano-D2   BC-prim    BC-unt BC-unt-D2      PBMC
## [1,] 0.2435091 0.21378986 0.2534715 0.2548616 0.2204577 0.8059736
## [2,] 0.1163004 0.13758030 0.2057558 0.1959547 0.1182565 0.8006784
## [3,] 0.2480159 0.34576858 0.2533172 0.2690490 0.2473293 0.8265421
## [4,] 0.1536990 0.09708683 0.1366534 0.1845413 0.1083211 0.7881420
## [5,] 0.6950754 0.60269720 0.6690452 0.6503435 0.5242666 0.2656988
## [6,] 0.4551689 0.30091318 0.3345999 0.3149783 0.2975698 0.8674939
##      TC-prim-D2
## [1,]  0.8365404
## [2,]  0.8469494
## [3,]  0.8259862
## [4,]  0.8473733
## [5,]  0.1958783
## [6,]  0.8668086

PCA and heatmaps on DMRs

Exercise: Create a heatmap based on the beta values we just calculated:

The DMRs are based on the comparison between BC-prim and TC-prim-D2. We can see a clear clustering between B cells vs PBMC and T cells, but we don’t see a substructure in B cells treated / untreated at this point.

Creating a PCA on the data tells us a similar picture:

library(ggrepel)
model_meth_seq <- prcomp(na.omit(as.matrix(meth_values)))
ggplot(as.data.frame(model_meth_seq$rotation), aes(x=PC1, y=PC2)) +
      geom_point() +
      geom_text_repel(data=as.data.frame(model_meth_seq$rotation), aes(label=rownames(model_meth_seq$rotation)))

Public WGBS + RNA-seq dataset

ExperimentHub provides access to ready to use R objects which have most often been preprocessed. These can be large files which have been made publicly available. We first load the necessary ExperimentHub libraries:

library(ExperimentHub)

We create an ExperimentHub object to interact with the ExperimentHub web service:

eh <- ExperimentHub()

Which packages are contained in ExperimentHub?

unique(package(eh))
##  [1] "GSE62944"               "alpineData"            
##  [3] "CellMapperData"         "HumanAffyData"         
##  [5] "curatedMetagenomicData" "SeqSQC"                
##  [7] "restfulSEData"          "curatedTCGAData"       
##  [9] "HarmonizedTCGAData"     "HMP16SData"            
## [11] "TENxBrainData"          "MetaGxOvarian"         
## [13] "CLLmethylation"         "tissueTreg"            
## [15] "MetaGxBreast"           "FlowSorted.Blood.EPIC" 
## [17] "MetaGxPancreas"

What is behind “curatedTCGAData”? Long list, limit to first 20 entries.

listResources(eh, "curatedTCGAData")[1:20]
##  [1] "ACC_CNASNP-20160128"                   
##  [2] "ACC_CNVSNP-20160128"                   
##  [3] "ACC_colData-20160128"                  
##  [4] "ACC_GISTIC_AllByGene-20160128"         
##  [5] "ACC_GISTIC_ThresholdedByGene-20160128" 
##  [6] "ACC_metadata-20160128"                 
##  [7] "ACC_miRNASeqGene-20160128"             
##  [8] "ACC_Mutation-20160128"                 
##  [9] "ACC_RNASeq2GeneNorm-20160128"          
## [10] "ACC_RPPAArray-20160128"                
## [11] "ACC_sampleMap-20160128"                
## [12] "BLCA_CNASeq-20160128"                  
## [13] "BLCA_CNASNP-20160128"                  
## [14] "BLCA_CNVSNP-20160128"                  
## [15] "BLCA_colData-20160128"                 
## [16] "BLCA_GISTIC_AllByGene-20160128"        
## [17] "BLCA_GISTIC_ThresholdedByGene-20160128"
## [18] "BLCA_metadata-20160128"                
## [19] "BLCA_miRNASeqGene-20160128"            
## [20] "BLCA_Mutation-20160128"

What is behind tissueTreg?

listResources(eh, "tissueTreg")
## [1] "Bisulfite sequencing data from tissue Tregs (per sample)"          
## [2] "Bisulfite sequencing data from tissue Tregs (per tissue/cell type)"
## [3] "RNA-seq data from tissue Tregs (RPKM values)"                      
## [4] "RNA-seq data from tissue Tregs (htseq values)"

We are going to work with the first three datasets, we will load and store them under more ‘human’ readable names.

Load the RPKM values:

rpkms <- eh[["EH1074"]]

Load the whole genome bisulfite sequencing data as bsseq objects:

tregs_per_sample <- eh[["EH1072"]]
tregs_per_tissue <- eh[["EH1073"]]